Read data from Cluster Analysis Input Data.xlsx. From examining the file via Excel, we see that the data is located in the tab named “Database”. We call that tab in the read_xlsx function.
cluster_dat <- readxl::read_xlsx("Cluster Analysis Input Data.xlsx",sheet = 'Database')
Examine cluster_dat.
str(cluster_dat)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1825 obs. of 23 variables:
## $ Time Bin : POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 05:00:00" ...
## $ X__1 : POSIXct, format: "2017-01-01" "2017-01-01" ...
## $ X__2 : POSIXct, format: "2017-01-01" "2017-01-01" ...
## $ X__3 : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 05:00:00" ...
## $ X__4 : num 22 23 24 25 26 22 23 24 25 26 ...
## $ Density : num 2.25 2.64 8.93 9.7 4.33 ...
## $ X__5 : num 17 18 19 20 21 17 18 19 20 21 ...
## $ Volume : num 68531 86294 239073 318088 136445 ...
## $ X__6 : num 27 28 29 30 31 27 28 29 30 31 ...
## $ Speed : num 72.7 74.4 75.4 74.1 73.3 ...
## $ Light Snow : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Moderate Snow : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Heavy Snow : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Light Rain : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Moderate Rain : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Heavy Rain : POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Incident Count : num 0 0 2 1 0 0 2 0 5 0 ...
## $ Incident Duration: POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Incident Impact : num 0 0 2.5 1.5 0 0 2 0 8.5 0 ...
## $ Roadwork Count : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Roadwork Duration: POSIXct, format: "1899-12-31 00:00:00" "1899-12-31 00:00:00" ...
## $ Roadwork Impact : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Week : num 1 1 1 1 1 1 1 1 1 1 ...
First, we clean up the column headers by removing any potential trailing or leading spaces and replacing special characters in the strings with ’_’.
colnames(cluster_dat) <-trimws(colnames(cluster_dat))
colnames(cluster_dat) <-gsub('([[:punct:]])|\\s+','_',colnames(cluster_dat))
colnames(cluster_dat)
## [1] "Time_Bin" "X__1" "X__2"
## [4] "X__3" "X__4" "Density"
## [7] "X__5" "Volume" "X__6"
## [10] "Speed" "Light_Snow" "Moderate_Snow"
## [13] "Heavy_Snow" "Light_Rain" "Moderate_Rain"
## [16] "Heavy_Rain" "Incident_Count" "Incident_Duration"
## [19] "Incident_Impact" "Roadwork_Count" "Roadwork_Duration"
## [22] "Roadwork_Impact" "Week"
We look to fix the DateTime fields. The first five columns looks to be parsed versions of a DateTime. The ‘Time Bin’ looks to be the most complete DateTime variable. Examine further…
class(cluster_dat$Time_Bin)
## [1] "POSIXct" "POSIXt"
cluster_dat$Time_Bin <- as.POSIXct(as.character(cluster_dat$Time_Bin),tz="US/Central")
summary(cluster_dat$Time_Bin)
## Min. 1st Qu. Median
## "2017-01-01 00:00:00" "2017-04-02 05:00:00" "2017-07-02 10:00:00"
## Mean 3rd Qu. Max.
## "2017-07-02 09:56:52" "2017-10-01 14:00:00" "2017-12-31 19:00:00"
table(is.na(cluster_dat$Time_Bin))
##
## FALSE
## 1825
Everything seems to be in order. But we will keep an eye out for potential errors.
The last column is labelled week. Explore:
summary(cluster_dat$Week)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 14.00 27.00 26.57 40.00 53.00
Week 53 does not look right. Lets investigate further.
cluster_dat[cluster_dat$Week==53,c(1:5,length(cluster_dat))]
Week 53 represents the last day of the year. We will ignore this field.
summary(cluster_dat$Incident_Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.234 3.000 30.000
hist(cluster_dat$Incident_Count,breaks = 30)
Majority of days have no incidents.
Explore the content of Incident Impact and Incident Count.
summary(cluster_dat$Incident_Impact)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.877 4.000 38.000
par(mfrow=c(2,2))
hist(cluster_dat$Incident_Impact,breaks=seq(0,40,1),
xlab='Incident_Impact')
plot(cluster_dat$Incident_Impact[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
xlab="Incident_Impact",ylab='Incident_Count')
plot(cluster_dat$Time_Bin[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
xlab="Time",ylab='Incident_Count')
plot(cluster_dat$Density[cluster_dat$Incident_Count!=0],cluster_dat$Incident_Count[cluster_dat$Incident_Count!=0],
xlab="Density",ylab='Incident_Count')
It appears that the incident count measures the amount on incidents in a specific time bin. The incident impact tracks how many impacts happened in a specific time_bin. This will allow us to determine how bad each time_bin is in terms of incidents.
There are no easily detectable patterns when plotting incidents vs time but we do see a strong positive correlation between Density and Incidents counts. This makes sense as intuitively the more cars, the more incidents.
We will focus on Incident Duration as a variable. We will add a variable ‘Incident’ that is a total incident minutes for each time bin.
##create a total sum value for minutes of incidents in each time bin.
cluster_dat_incident_melt <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars='Incident_Duration')%>%
mutate(Hrs = as.numeric(format(value,"%H")),
Mins = as.numeric(format(value,"%M")))%>%
mutate(total = Hrs*60+Mins)%>%
group_by(Time_Bin)%>%
summarise(sum = sum(total))
cluster_dat$Incident <- cluster_dat_incident_melt$sum
summary(cluster_dat$Roadwork_Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1118 0.0000 7.0000
table(cluster_dat$Roadwork_Count)
##
## 0 1 2 3 4 5 7
## 1689 97 24 7 4 3 1
par(mfrow=c(2,2))
hist(cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],xlab='Roadwork_Impact',main='Roadwork_Count')
plot(cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Impact[cluster_dat$Roadwork_Count!=0],
xlab="Roadwork_Count",ylab='Roadwork_Impact')
plot(cluster_dat$Time_Bin[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],
xlab="Time",ylab='Roadwork_Count')
plot(cluster_dat$Density[cluster_dat$Roadwork_Count!=0],cluster_dat$Roadwork_Count[cluster_dat$Roadwork_Count!=0],
xlab="Density",ylab='Roadwork_Count')
x <- as.numeric(format(cluster_dat$Roadwork_Duration,"%H"))
x <- x[x>0]
table(x)
## x
## 1 2 3 4 5 6 7 8 9 10 11 22
## 27 11 14 9 6 3 2 3 1 1 1 1
Majority of days have no Roadwork. One really bad day with 7. No obvious outliers.
Lets look at the Density, Volume, Speed variables. Again there are columns without names mixed in between these fields. We will ignore the unnamed variables.
cluster_dat <- cluster_dat %>%
mutate(Hour = format(Time_Bin,"%H"),
Minute = format(Time_Bin,"%M"),
Weekday = weekdays(cluster_dat$Time_Bin),
Week = lubridate::week(cluster_dat$Time_Bin),
Month = months(cluster_dat$Time_Bin),
YearDay = yday(cluster_dat$Time_Bin))%>%
filter(Hour == 14)%>%
as.data.frame()
cluster_dat$Weekday <- factor(cluster_dat$Weekday,levels=c("Sunday", "Monday",
"Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
cluster_dat$Month <- factor(cluster_dat$Month,levels=c("January","February","March",
"April","May","June","July","August","September",
"October","November","December"),ordered=TRUE)
summary(cluster_dat$Density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.19 13.57 21.41 20.09 24.59 36.69
br_dens <- seq(0,50,2)
ranges_dens = paste(head(br_dens,-1), br_dens[-1], sep=" - ")
freq_dens <- hist(cluster_dat$Density, breaks=br_dens, include.lowest=TRUE, plot=FALSE)
freq_table_dens <- data.frame(data.frame(range = ranges_dens, frequency = freq_dens$counts))
freq_table_dens[order(-freq_table_dens$frequency),]
plot(freq_dens)
The histogram is right skewed but no major outliers are present. There are also no 0 or NA values.
We now plot the Density vs Time.
ggplot(cluster_dat,aes(x=Weekday,y=Density,color=Weekday))+
geom_violin()+
geom_jitter(alpha=0.25)
Again, there do not seem to major outliers. There looks to be some patterns here.
The Density is obviously lower on weekends compared to weekdays.
summary(cluster_dat$Volume)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 296556 430715 530181 491952 547131 569488
br_vol <- seq(0,600000,20000)
ranges_vol = paste(head(br_vol,-1), br_vol[-1], sep=" - ")
freq_vol <- hist(cluster_dat$Volume, breaks=br_vol, include.lowest=TRUE, plot=FALSE)
freq_table_vol <- data.frame(data.frame(range = ranges_vol, frequency = freq_vol$counts))
freq_table_vol[order(-freq_table_vol$frequency),]
plot(freq_vol)
It appears that the bin 40,000-60,000 contains the most records.
Again some interesting patterns but no major outliers.
We move on to speed….
We will explore the Speed variable.
summary(cluster_dat$Speed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.79 61.92 65.84 66.01 71.86 75.11
br_speed <- seq(30,100,2)
ranges_speed = paste(head(br_speed,-1), br_speed[-1], sep=" - ")
freq_speed <- hist(cluster_dat$Speed, breaks=br_speed, include.lowest=TRUE, plot=FALSE)
freq_table_speed <- data.frame(data.frame(range = ranges_speed, frequency = freq_speed$counts))
freq_table_speed[order(-freq_table_speed$frequency),]
plot(freq_speed)
The speed plot looks more normal in shape than the previous distributions but does appear to have a left skew. Again there are no major outliers to be worried about nor are there are any 0 or NAs.
Lets plot speed vs time
means <- aggregate(Speed~Weekday,cluster_dat,mean)
ggplot(cluster_dat,aes(x=Weekday,y=Speed))+
geom_violin(aes(color=Weekday))+
geom_jitter(aes(color=Weekday),alpha=0.25)+
stat_summary(fun.y=mean, colour="#990000", geom="point",
shape=18, size=3,show_guide = FALSE)+
geom_text(data=means, aes(label = paste('Mean:\n',round(Speed,2),sep=''), y = Speed + 2.5),size=3,fontface='bold')+
ylab('Speed mph')+
theme(legend.position="none")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
means <- aggregate(Speed~Month,cluster_dat,mean)
ggplot(cluster_dat,aes(x=Month,y=Speed))+
geom_violin(aes(color=Month))+
geom_jitter(aes(color=Month),alpha=0.25)+
stat_summary(fun.y=mean, colour="#990000", geom="point",
shape=18, size=3,show_guide = FALSE)+
geom_text(data=means, aes(label = paste('Mean:\n',round(Speed,2),sep=''), y = Speed + 2.5),color='darkred',size=3,fontface='bold')+
ylab('Speed mph')+
theme(legend.position="none")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
Looks like we have some reduced speeds in the Summer months, maybe due to construction activities. There are also some slower days in the winter months which could very easily be due to bad weather. But there are no noticeable patterns.
ggplot(cluster_dat,aes(x=Week,y=Speed,color=Weekday))+
geom_point(aplha=0.25)+
geom_smooth()
## Warning: Ignoring unknown parameters: aplha
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Again we are seeing a pattern with reduced speeds on weekends.
Lets look at public holidays:
hols_2017 <- as.POSIXct(c('2017-01-02 14:00','2017-05-29 14:00','2017-07-03 14:00','2017-07-04 14:00','2017-09-04 14:00','2017-11-23 14:00','2017-11-24 14:00','2017-12-25 14:00'))
table(cluster_dat$Time_Bin %in% hols_2017)
##
## FALSE TRUE
## 357 8
cluster_dat$hols_2017 <- cluster_dat$Time_Bin %in% hols_2017
g <- ggplot(cluster_dat,aes(x=Weekday,y=Density))+
geom_point(aes(alpha=0.25,color=hols_2017))
plotly::ggplotly(g)
s <- ggplot(cluster_dat,aes(x=Weekday,y=Speed))+
geom_point(aes(alpha=0.25,color=hols_2017))
plotly::ggplotly(s)
Public holidays seem to be the weekdays with highest speeds.
We need to generate a total minute count for weather events in each time bin.
##create a total sum value for minutes of weather events in each time bin.
cluster_dat_melt_RS <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars=c("Light_Rain","Light_Snow","Moderate_Rain","Moderate_Snow","Heavy_Snow","Heavy_Rain"))%>%
mutate(Hrs = as.numeric(format(value,"%H")),
Mins = as.numeric(format(value,"%M")),
Percip = str_sub(variable,-4))%>%
mutate(total = Hrs*60+Mins)%>%
group_by(Time_Bin,Percip)%>%
summarise(sum = sum(total))%>%
dcast(Time_Bin~Percip,value.var = 'sum')
cluster_dat_melt_weather <- reshape2::melt(cluster_dat,id.vars="Time_Bin",measure.vars=colnames(cluster_dat)[11:16])%>%
mutate(Hrs = as.numeric(format(value,"%H")),
Mins = as.numeric(format(value,"%M")))%>%
mutate(total = Hrs*60+Mins)%>%
group_by(Time_Bin)%>%
summarise(sum = sum(total))
cluster_dat$weather <- cluster_dat_melt_weather$sum
cluster_dat$Rain <- cluster_dat_melt_RS$Rain
cluster_dat$Snow <- cluster_dat_melt_RS$Snow
rm(cluster_dat_melt_RS,cluster_dat_melt_weather)
Now we filter out the weekends. We already determined that they will have their own cluster.
##filter out the weekends
'%ni%' <- Negate('%in%')
cluster_dat_midweek <- cluster_dat %>%
filter(Weekday %ni% c('Saturday','Sunday'))
## K Means Clustering {.tabset .tabset-fade}
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow,Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 5 proposed 5 as the best number of clusters
## * 3 proposed 6 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 11 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.
set.seed(250)
#run kmeans
speed_cluster <- kmeans(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow,Incident)%>%scale(),centers = 5,nstart = 25)
##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)
We will look at some plots to display the clusters.
2D Plot Year Day ~ Speed
A standard 2d plot.
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,color=~cluster,type='scatter',mode='markers',text=~paste('Time_Bin:', format(Time_Bin,"%Y-%m-%d")))
The 2d plot does not do a good job of showing where the clusters split.
set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,weather)%>%scale(), min.nc=2, max.nc=15, method="kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.
set.seed(250)
#run kmeans
speed_cluster <- kmeans(cluster_dat_midweek%>%select(Speed,YearDay,weather)%>%scale(),centers = 5,nstart = 25)
##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)
We will look at some plots to display the clusters.
2D Plot Year Day ~ Speed
First a simple 2d plot
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,color=~cluster,type='scatter',mode='markers',text=~paste('Time_Bin:', format(Time_Bin,"%Y-%m-%d")))
The 2d plot does not do a good job of showing where the clusters split.
3D Plot Year Day ~ Speed ~ Weather
we will try a 3d plot with weather on the z axis.
#3d plot to better show
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~weather,
color=~cluster, hoverinfo='text',
text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>%
add_markers()%>%
layout(scene = list(xaxis = list(title = 'Day of Year',
range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
yaxis = list(title = 'Speed MPH',
range = c(min(cluster_dat_midweek$Speed),
max(cluster_dat_midweek$Speed))),
zaxis = list(title = 'Weather mins',
range = c(min(cluster_dat_midweek$weather),
max(cluster_dat_midweek$weather)))))
Much better.
set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,weather, Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 5 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.
set.seed(250)
#run kmeans
speed_cluster <- kmeans(cluster_dat_midweek%>%select(Speed,YearDay,weather, Incident)%>%scale(),centers = 5,nstart = 25)
##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)
We will look at some plots to display the clusters.
#3d plot to better show
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~weather,
color=~cluster, hoverinfo='text',
text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>%
add_markers()%>%
layout(scene = list(xaxis = list(title = 'Day of Year',
range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
yaxis = list(title = 'Speed MPH',
range = c(min(cluster_dat_midweek$Speed),
max(cluster_dat_midweek$Speed))),
zaxis = list(title = 'Weather mins',
range = c(min(cluster_dat_midweek$weather),
max(cluster_dat_midweek$weather)))))
Much better.
3D Plot Year Day ~ Speed, Incidents
Now lets look at Incidents on the z axis.
#3d plot to better show
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Incident,
color=~cluster, hoverinfo='text',
text = ~paste('Speed:', round(Speed,2), '<br>Incidents mins:', Incident, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>%
add_markers()%>%
layout(scene = list(xaxis = list(title = 'Day of Year',
range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
yaxis = list(title = 'Speed MPH',
range = c(min(cluster_dat_midweek$Speed),
max(cluster_dat_midweek$Speed))),
zaxis = list(title = 'Incident mins',
range = c(min(cluster_dat_midweek$Incident),
max(cluster_dat_midweek$Incident)))))
set.seed(250)
nc <- NbClust(cluster_dat_midweek%>%select(Speed,YearDay,Rain, Snow, Incident)%>%scale(), min.nc=2, max.nc=15, method="kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 5 proposed 5 as the best number of clusters
## * 3 proposed 6 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 11 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
We run the kmeans function with 5 centers and create a variable for the cluster designation. We will add this to our dataframe to facilitate plotting.
set.seed(250)
#run kmeans
speed_cluster <- kmeans(cluster_dat_midweek%>%select(Speed,YearDay,Rain,Snow, Incident)%>%scale(),centers = 5,nstart = 25)
##add cluster to fitlered dataframe
cluster_dat_midweek$cluster <- as.factor(speed_cluster$cluster)
We will look at some plots to display the clusters.
#3d plot to better show
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Snow,
color=~cluster, hoverinfo='text',
text = ~paste('Speed:', round(Speed,2), '<br>Weather mins:', weather, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>%
add_markers()%>%
layout(scene = list(xaxis = list(title = 'Day of Year',
range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
yaxis = list(title = 'Speed MPH',
range = c(min(cluster_dat_midweek$Speed),
max(cluster_dat_midweek$Speed))),
zaxis = list(title = 'Snow mins',
range = c(min(cluster_dat_midweek$Snow),
max(cluster_dat_midweek$Snow)))))
Much better.
3D Plot Year Day ~ Speed, Incidents
Now lets look at Incidents on the z axis.
#3d plot to better show
plotly::plot_ly(cluster_dat_midweek,x=~YearDay,y=~Speed,z=~Incident,
color=~cluster, hoverinfo='text',
text = ~paste('Speed:', round(Speed,2), '<br>Incidents mins:', Incident, '<br>Time_Bin:', format(Time_Bin,"%Y-%m-%d")))%>%
add_markers()%>%
layout(scene = list(xaxis = list(title = 'Day of Year',
range = c(0,max(cluster_dat_midweek$YearDay)),autorange=F),
yaxis = list(title = 'Speed MPH',
range = c(min(cluster_dat_midweek$Speed),
max(cluster_dat_midweek$Speed))),
zaxis = list(title = 'Incident mins',
range = c(min(cluster_dat_midweek$Incident),
max(cluster_dat_midweek$Incident)))))